The dataset of choice for Project One was originally constructed by the World Health Organization (WHO) to track the health status of 193 countries across the world. The question we sought to answer is as follows:
What factors affect life expectancy in individuals across the world?
The dataset includes a variety of factors that contribute to the overall health status of a country. We wanted to know what effect these factors had specifically on life expectancy. An explanation of the different factors in this dataset is as follows:
• Country: Country
• Year: Year
• Status: Developed or Developing status
• Life expectancy: Life Expectancy in age
• Adult Mortality: Adult Mortality Rates of both sexes (probability of dying between 15 and 60 years per 1000 population)
• Infant deaths: Number of Infant Deaths per 1000 population
• Alcohol: Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol)
• Percentage expenditure: Expenditure on health as a percentage of GDP per capita(%)
• Hepatitis B: HepB immunization coverage among 1-year-olds (%)
• Measles: Number of reported measles cases per 1000 population
• BMI: Average Body Mass Index of entire population
• Under-five deaths: Number of under-five deaths per 1000 population
• Polio: Pol3 immunization coverage among 1-year-olds (%)
• Total expenditure: General government expenditure on health as a percentage of total government expenditure (%)
• Diphtheria: DTP3 immunization coverage among 1-year-olds (%)
• HIV/AIDS: Deaths due to HIV/AIDS per 1,000 live births (ages 0-4 years)
• GDP: Gross Domestic Product per capita (in USD)
• Population: Population of the country
• Thinness 10-19 years: Prevalence of thinness among children/adolescents, age 10 to 19 (%)
• Thinness 5-9 years: Prevalence of thinness among children, age 5 to 9 (%)
• Income comp of resources: HDI in terms of income composition of resources (index ranging from 0 to 1)
• Schooling: Number of years of Schooling (years)
Data Source: Kaggle
# Complete-case copy of the data: any row containing an NA is dropped.
life2 <- na.omit(life)
loadPkg("ggplot2")

# Histogram of life expectancy (all rows of `life`; ggplot2 drops NA values
# itself when drawing).
ggplot(data = life) +
  aes(x = Life.expectancy) +
  geom_histogram(fill = "lightblue", color = "darkblue") +
  labs(title = "Life Expectancy Histogram", x = "Life Expectancy (Age)")

# Normal Q-Q plot of the same variable.
qqnorm(
  life[["Life.expectancy"]],
  main = "Life Expectancy Q-Q Plot",
  ylab = "Life Expectancy (Age)"
)
#ggplot(life2, aes(x=Total.expenditure, y=Life.expectancy, fill=Total.expenditure, group = 1)) + geom_boxplot() + scale_fill_brewer(palette="Spectral") + ggtitle("Life Expectancy vs. Total Expenditure") + ylab("Life Expectancy") + xlab("Total Expenditure ($)")

# Scatter plot of government health expenditure (y) against life expectancy
# (x), with the least-squares regression line overlaid in red.
# plot() and abline() are issued as two separate statements: base graphics
# are not composed with `+` (that is ggplot2 syntax). Adding the two calls
# together only summed their NULL return values and printed a stray empty
# vector underneath the figure.
plot(
  life$Life.expectancy, life$Total.expenditure,
  main = " Govt Expenditure on Healthcare(y) vs Life Expectancy (x) ",
  xlab = "Life Expectancy ",
  ylab = "% Healthcare Expenditure (out of total spending) ",
  pch = 19
)
# Regression of y on x; lm() silently drops rows where either value is NA.
abline(lm(Total.expenditure ~ Life.expectancy, data = life), col = "red")
# Box plots of life expectancy for each year, drawn in one panel per
# development status. Year is converted to a factor so that every year gets
# its own box; the x-axis labels are rotated to stay legible.
ggplot(data = life) +
  aes(x = factor(Year), y = Life.expectancy) +
  geom_boxplot() +
  facet_wrap(~Status) +
  theme(axis.text.x = element_text(angle = 90))

# Disabled alternatives: per-column means (ignoring NA) and standard deviations.
#sapply(life, mean, na.rm=TRUE)
#sapply(life, sd)

# Per-column summary statistics of the full data set.
summary(life)
A correlation plot of the different variables is printed below. Some notable takeaways are listed below:
• ‘Adult Mortality’ has the strongest negative correlation with ‘Life Expectancy’ (roughly -0.70).
• ‘Income Composition of Resources’ and ‘Schooling’ have strong positive correlations with ‘Life Expectancy’ (roughly 0.75).
• ‘HIV/AIDS’ has a relatively strong negative correlation with ‘Life Expectancy’ (roughly -0.55).
• ‘Measles’ and ‘Population’ have almost no correlation with ‘Life Expectancy’ (roughly 0).
library(dplyr)

# Report the class of every column to identify the non-numeric ones.
sapply(life, class)

# Drop the two categorical columns, which cor() cannot handle.
life_nofactor <- life %>%
  select(-c(Country, Status))

# Correlation matrix computed on complete cases only.
cor_life <- life_nofactor %>%
  na.omit() %>%
  cor()

library(corrplot)
# Show only the lower triangle; the matrix is symmetric.
corrplot(cor_life, type = "lower")
# Scatter plots with ggplot2's default smoother for three variable pairs.

# Health expenditure against GDP.
ggplot(data = life) +
  aes(x = GDP, y = percentage.expenditure) +
  geom_point() +
  geom_smooth()

# Life expectancy against HIV/AIDS deaths.
ggplot(data = life) +
  aes(x = HIV.AIDS, y = Life.expectancy) +
  geom_point() +
  geom_smooth()

# Life expectancy against years of schooling.
ggplot(data = life) +
  aes(x = Schooling, y = Life.expectancy) +
  geom_point() +
  geom_smooth()
#summary(life)

# Complete-case subsets of the data, one per development status.
life_developed <- na.omit(subset(life, Status == "Developed"))
life_developing <- na.omit(subset(life, Status == "Developing"))

# Mean life expectancy within each group.
mean_life_developed <- mean(life_developed$Life.expectancy)
mean_life_developing <- mean(life_developing$Life.expectancy)

# One-sample t-tests, used here only to obtain a 95% confidence interval for
# each group mean. (Their p-values test "mean = 0" and say nothing about the
# difference between the groups.)
t_life_developed <- t.test(x = life_developed$Life.expectancy, conf.level = 0.95)
t_life_developing <- t.test(x = life_developing$Life.expectancy, conf.level = 0.95)
t_life_developed$conf.int
t_life_developing$conf.int
mean_life_developed
mean_life_developing

# Two-sample (Welch) t-test of Developed vs Developing life expectancy. This
# is the test that actually supports a statement about the two group means
# being different; read the conclusion off its printed p-value.
# NOTE(review): rows are country-years, so observations within a country are
# not independent -- treat the p-value as indicative.
t_life_status <- t.test(
  x = life_developed$Life.expectancy,
  y = life_developing$Life.expectancy,
  conf.level = 0.95
)
t_life_status
# Chi-squared test of independence between life expectancy and
# Developed/Developing status.
# Life expectancy is continuous, so cross-tabulating its raw values produces
# one row per distinct value and a very sparse table, for which the
# chi-squared approximation is unreliable. The values are therefore grouped
# into quintile bands first, giving well-populated cells.
le_breaks <- unique(
  quantile(life$Life.expectancy, probs = seq(0, 1, by = 0.2), na.rm = TRUE)
)
# include.lowest = TRUE keeps the minimum value inside the first band.
le_band <- cut(life$Life.expectancy, breaks = le_breaks, include.lowest = TRUE)
contable <- table(le_band, life$Status)
chitest <- chisq.test(contable)
chitest # a p-value below 0.05 rejects independence of the two variables
# Overlaid histograms of life expectancy for the two status groups, first as
# absolute counts and then as relative frequencies (densities).
# Semi-transparent fills (alpha 0.5) keep both distributions visible.
# TRUE/FALSE are written out in full: T and F are ordinary variables that can
# be reassigned, which would silently change these calls.

# --- Absolute counts ---
# Developing countries are drawn first and set up the axes.
hist(life_developing$Life.expectancy, col = rgb(1, 0, 0, 0.5), xlab = "Life Expectancy",
     ylab = "Count", main = "Life Expectancy of Developed vs Developing Countries-Absolute Count")
# Developed countries are added on top of the existing plot.
hist(life_developed$Life.expectancy, col = rgb(0, 0, 1, 0.5), add = TRUE)
legend("topright", legend = c("Developing", "Developed"),
       col = c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), pt.cex = 2, pch = 15)

# --- Relative frequencies ---
# freq = FALSE plots densities so the differently sized groups are comparable;
# ylim is fixed so that the second histogram is not clipped.
hist(life_developing$Life.expectancy, col = rgb(1, 0, 0, 0.5), xlab = "Life Expectancy",
     ylab = "Proportion", main = "Life Expectancy of Developed vs Developing Countries-Relative Frequencies",
     freq = FALSE, ylim = c(0, 0.15))
hist(life_developed$Life.expectancy, col = rgb(0, 0, 1, 0.5), add = TRUE, freq = FALSE)
legend("topright", legend = c("Developing", "Developed"),
       col = c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), pt.cex = 2, pch = 15)
# The variables most highly correlated with life expectancy were
# Income.composition.of.resources and Schooling; BMI and HIV/AIDS also showed
# fairly high correlations. Adult mortality is highly correlated as well, but
# for obvious reasons, so it is not evaluated here.
# Each pair below compares the mean of one of these variables between the
# developed and the developing subset.
developed_income <- with(life_developed, mean(Income.composition.of.resources))
developing_income <- with(life_developing, mean(Income.composition.of.resources))

developed_school <- with(life_developed, mean(Schooling))
developing_school <- with(life_developing, mean(Schooling))

developed_BMI <- with(life_developed, mean(BMI))
developing_BMI <- with(life_developing, mean(BMI))

developed_AIDS <- with(life_developed, mean(HIV.AIDS))
developing_AIDS <- with(life_developing, mean(HIV.AIDS))

# Print the eight means, developed first within each pair.
developed_income
developing_income
developed_school
developing_school
developed_BMI
developing_BMI
developed_AIDS
developing_AIDS
# Life expectancy over time, coloured by development status and split into
# one panel per status, with ggplot2's default smoother showing the trend.
ggplot(data = life) +
  aes(x = Year, y = Life.expectancy, colour = Status) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~Status)
# Load the raw data and build a copy whose country names match the region
# names used by the world map polygons.
data <- read.csv("lifeexpectancydata.csv")
data_world_map <- data.frame(data)

# Country name in the data set -> name used on the map.
country_renames <- c(
  "United States of America" = "USA",
  "United Kingdom of Great Britain and Northern Ireland" = "UK",
  "Russian Federation" = "Russia",
  "Bolivia (Plurinational State of)" = "Bolivia",
  "Brunei Darussalam" = "Brunei",
  "Czechia" = "Czech Republic",
  "Democratic People's Republic of Korea" = "North Korea",
  "Iran (Islamic Republic of)" = "Iran",
  "Lao People's Democratic Republic" = "Laos",
  "Micronesia (Federated States of)" = "Micronesia",
  "Republic of Korea" = "South Korea",
  "Republic of Moldova" = "Moldova",
  "Saint Vincent and the Grenadines" = "Saint Vincent",
  "Syrian Arab Republic" = "Syria",
  "The former Yugoslav republic of Macedonia" = "Macedonia",
  "United Republic of Tanzania" = "Tanzania",
  "Venezuela (Bolivarian Republic of)" = "Venezuela",
  "Viet Nam" = "Vietnam"
)

# Rename on a plain character vector. This behaves the same whether
# read.csv() delivered Country as a factor or as character, and avoids
# extending factor levels one name at a time.
country_chr <- as.character(data_world_map$Country)
needs_rename <- country_chr %in% names(country_renames)
country_chr[needs_rename] <- unname(country_renames[country_chr[needs_rename]])
data_world_map$Country <- country_chr
# World map of life expectancy in the year 2000.
two_thousand <- subset(data_world_map, Year == 2000)
loadPkg("ggmap")
loadPkg("tidyverse")
loadPkg("dplyr")
# Polygon outlines of the world; `region` holds the country name.
world_data <- map_data("world")

# Keep every polygon so that countries without a 2000 record are still drawn
# (with an NA fill). The former row filter compared two_thousand$Country with
# itself, was TRUE everywhere and therefore selected all rows anyway.
combined_2000 <- world_data
# Attach each polygon's life expectancy by country name; regions without a
# matching record get NA.
combined_2000$value <- two_thousand$Life.expectancy[
  match(combined_2000$region, two_thousand$Country)
]
# The former `for (i in cdf)` loop was removed: it was built from
# combined_2000$Country, a column the map data does not have, so it never ran.

ggplot(data = combined_2000, aes(x = long, y = lat, group = group, fill = value)) +
  geom_polygon(colour = "white") +
  scale_fill_continuous(low = "blue",
                        high = "orange",
                        guide = "colorbar") +
  theme_bw() +
  labs(fill = "Life Expectancy", title = "Life Expectancy Across the World (2000)", x = "", y = "") +
  # Empty breaks suppress the longitude/latitude tick marks and labels.
  scale_y_continuous(breaks = c()) +
  scale_x_continuous(breaks = c()) +
  theme(panel.border = element_blank())
# World map of life expectancy in the year 2005.
# ggmap, tidyverse and dplyr are already attached, and `world_data` already
# built, by the 2000 map chunk above, so they are not loaded again here.
two_thousand_five <- subset(data_world_map, Year == 2005)

# Keep every polygon so that countries without a 2005 record are still drawn
# (with an NA fill). The former row filter compared
# two_thousand_five$Country with itself, was TRUE everywhere and therefore
# selected all rows anyway.
combined_2005 <- world_data
# Attach each polygon's life expectancy by country name; regions without a
# matching record get NA.
combined_2005$value <- two_thousand_five$Life.expectancy[
  match(combined_2005$region, two_thousand_five$Country)
]
# The former `for (i in cdf)` loop was removed: it was built from
# combined_2005$Country, a column the map data does not have, so it never ran.

ggplot(data = combined_2005, aes(x = long, y = lat, group = group, fill = value)) +
  geom_polygon(colour = "white") +
  scale_fill_continuous(low = "blue",
                        high = "orange",
                        guide = "colorbar") +
  theme_bw() +
  labs(fill = "Life Expectancy", title = "Life Expectancy Across the World (2005)", x = "", y = "") +
  # Empty breaks suppress the longitude/latitude tick marks and labels.
  scale_y_continuous(breaks = c()) +
  scale_x_continuous(breaks = c()) +
  theme(panel.border = element_blank())
# World map of life expectancy in the year 2010.
# ggmap, tidyverse and dplyr are already attached, and `world_data` already
# built, by the 2000 map chunk above, so they are not loaded again here.
two_thousand_ten <- subset(data_world_map, Year == 2010)

# Keep every polygon so that countries without a 2010 record are still drawn
# (with an NA fill). The former row filter compared two_thousand_ten$Country
# with itself, was TRUE everywhere and therefore selected all rows anyway.
combined_2010 <- world_data
# Attach each polygon's life expectancy by country name; regions without a
# matching record get NA.
combined_2010$value <- two_thousand_ten$Life.expectancy[
  match(combined_2010$region, two_thousand_ten$Country)
]
# The former `for (i in cdf)` loop was removed: it was built from
# combined_2010$Country, a column the map data does not have, so it never ran.

ggplot(data = combined_2010, aes(x = long, y = lat, group = group, fill = value)) +
  geom_polygon(colour = "white") +
  scale_fill_continuous(low = "blue",
                        high = "orange",
                        guide = "colorbar") +
  theme_bw() +
  labs(fill = "Life Expectancy", title = "Life Expectancy Across the World (2010)", x = "", y = "") +
  # Empty breaks suppress the longitude/latitude tick marks and labels.
  scale_y_continuous(breaks = c()) +
  scale_x_continuous(breaks = c()) +
  theme(panel.border = element_blank())
# Combining and plotting LE data from the year 2015
# ggmap, tidyverse and dplyr are already attached, and `world_data` already
# built, by the 2000 map chunk above, so they are not loaded again here.
two_thousand_fifteen <- subset(data_world_map, Year == 2015)

# Keep every polygon so that countries without a 2015 record are still drawn
# (with an NA fill). The former row filter compared
# two_thousand_fifteen$Country with itself, was TRUE everywhere and therefore
# selected all rows anyway.
combined_2015 <- world_data
# Attach each polygon's life expectancy by country name; regions without a
# matching record get NA.
combined_2015$value <- two_thousand_fifteen$Life.expectancy[
  match(combined_2015$region, two_thousand_fifteen$Country)
]
# The former `for (i in cdf)` loop was removed: it was built from
# combined_2015$Country, a column the map data does not have, so it never ran.

ggplot(data = combined_2015, aes(x = long, y = lat, group = group, fill = value)) +
  geom_polygon(colour = "white") +
  scale_fill_continuous(low = "blue",
                        high = "orange",
                        guide = "colorbar") +
  theme_bw() +
  labs(fill = "Life Expectancy", title = "Life Expectancy Across the World (2015)", x = "", y = "") +
  # Empty breaks suppress the longitude/latitude tick marks and labels.
  scale_y_continuous(breaks = c()) +
  scale_x_continuous(breaks = c()) +
  theme(panel.border = element_blank())
loadPkg("dplyr")

# Mean life expectancy per country (and status) over all available years.
# `data_world_map` was built from the same `data` with exactly the country
# renames needed here, so it is reused instead of repeating the whole
# renaming block a second time.
data_le_means <- data_world_map

# Rows with a missing life expectancy, kept for inspection.
le_na <- subset(data_le_means, is.na(Life.expectancy))
# Rows with a recorded life expectancy. A direct NA filter replaces the
# former anti_join() against `le_na`, which joined on every column just to
# remove those same rows.
le_clean <- subset(data_le_means, !is.na(Life.expectancy))

# One row per country/status pair; ungroup() so the result carries no
# leftover grouping into later steps.
country_and_status <- le_clean %>%
  group_by(Country, Status) %>%
  summarise(mean_le = mean(Life.expectancy)) %>%
  ungroup()
# World map of each country's mean life expectancy over 2000-2015.
# ggmap, tidyverse and dplyr are already attached, and `world_data` already
# built, by the 2000 map chunk above, so they are not loaded again here.

# Keep every polygon so that countries without any record are still drawn
# (with an NA fill). The former row filter compared
# country_and_status$Country with itself, was TRUE everywhere and therefore
# selected all rows anyway.
combined_means <- world_data
# Attach each polygon's mean life expectancy by country name; regions
# without a matching record get NA.
combined_means$value <- country_and_status$mean_le[
  match(combined_means$region, country_and_status$Country)
]
# The former `for (i in cdf)` loop was removed: it was built from
# combined_means$Country, a column the map data does not have, so it never ran.

ggplot(data = combined_means, aes(x = long, y = lat, group = group, fill = value)) +
  geom_polygon(colour = "white") +
  scale_fill_continuous(low = "blue",
                        high = "orange",
                        guide = "colorbar") +
  theme_bw() +
  labs(fill = "Life Expectancy", title = "Mean Life Expectancy Across the World (2000-2015)", x = "", y = "") +
  # Empty breaks suppress the longitude/latitude tick marks and labels.
  scale_y_continuous(breaks = c()) +
  scale_x_continuous(breaks = c()) +
  theme(panel.border = element_blank())